// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
// Copyright (c) 2005, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// ---
// Unittest for the TCMalloc implementation.
//
// * The test consists of a set of threads.
// * Each thread maintains a set of allocated objects, with
//   a bound on the total amount of data in the set.
// * Each allocated object's contents are generated by
//   hashing the object pointer, and a generation count
//   in the object.  This allows us to easily check for
//   data corruption.
// * At any given step, the thread can do any of the following:
//     a. Allocate an object
//     b. Increment an object's generation count and update
//        its contents.
//     c. Pass the object to another thread
//     d. Free an object
//   Also, at the end of every step, object(s) are freed to maintain
//   the memory upper-bound.
//
#include "config_for_unittests.h"
// Complicated ordering requirements.  tcmalloc.h defines (indirectly)
// _POSIX_C_SOURCE, which it needs so stdlib.h defines posix_memalign.
// unistd.h, on the other hand, requires _POSIX_C_SOURCE to be unset,
// at least on FreeBSD, in order to define sbrk.  The solution
// is to #include unistd.h first.  This is safe because unistd.h
// doesn't sub-include stdlib.h, so we'll still get posix_memalign
// when we #include stdlib.h.  Blah.
#ifdef HAVE_UNISTD_H
#include <unistd.h>                 // for testing sbrk hooks
#endif
#include "tcmalloc_internal.h"      // must come early, to pick up posix_memalign
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <stdint.h>                 // for intptr_t
#include <sys/types.h>              // for size_t
#ifdef HAVE_FCNTL_H
#include <fcntl.h>                  // for open; used with mmap-hook test
#endif
#ifdef HAVE_MALLOC_H
#include <malloc.h>                 // defines pvalloc/etc on cygwin
#endif
#include <assert.h>

#ifndef _WIN32
#include <spawn.h> // for posix_spawn
#include <sys/wait.h> // for waitpid
#endif

#include <algorithm>
#include <array>
#include <functional>
#include <iterator>
#include <mutex>
#include <new>
#include <sstream>
#include <string>
#include <thread>
#include <vector>

#if __linux__ && __x86_64__
// for fork testing
#include <errno.h>
#include <sched.h>
#include <semaphore.h>
#include <signal.h>
#include <sys/syscall.h>
#include <ucontext.h>
#include <unistd.h>

#define HAVE_FORK_TESTING_SUPPORT
#endif // __linux__ && __x86_64__

#include "gperftools/malloc_hook.h"
#include "gperftools/malloc_extension.h"
#include "gperftools/nallocx.h"
#include "gperftools/tcmalloc.h"

#include "base/environ.h"
#include "base/cleanup.h"
#include "base/function_ref.h"
#include "base/logging.h"
#include "base/static_storage.h"

#include "tests/testutil.h"

#include "testing_portal.h"

#include "gtest/gtest.h"



static bool running_fork_testing;

using tcmalloc::TestingPortal;

namespace {

// SetFlag updates given variable to new value and returns
// tcmalloc::Cleanup that restores it to previous value.
template <typename T, typename V>
decltype(auto) SetFlag(T* ptr, V value) {
  T old_value = *ptr;
  *ptr = value;
  return tcmalloc::Cleanup{[=] () {
    *ptr = old_value;
  }};
}

struct NumericProperty {
  const char* const name;

  constexpr NumericProperty(const char* name) : name(name) {}

  // Override sets this property to new value and returns
  // tcmalloc::Cleanup that returns it to previous setting.
  decltype(auto) Override(size_t new_value) const {
    MallocExtension *e = MallocExtension::instance();
    size_t old_value;

    CHECK(e->GetNumericProperty(name, &old_value));
    CHECK(e->SetNumericProperty(name, new_value));

    return tcmalloc::Cleanup{[old_value, name = name] () {
      CHECK(MallocExtension::instance()->SetNumericProperty(name, old_value));
    }};
  }
};

constexpr NumericProperty kAggressiveDecommit{"tcmalloc.aggressive_memory_decommit"};

}  // namespace

// Windows doesn't define pvalloc and a few other obsolete unix
// functions; nor does it define posix_memalign (which is not obsolete).
#if defined(_WIN32)
# define valloc malloc
# define pvalloc malloc
// I'd like to map posix_memalign to _aligned_malloc, but _aligned_malloc
// must be paired with _aligned_free (not normal free), which is too
// invasive a change to how we allocate memory here.  So just bail
static bool kOSSupportsMemalign = false;
static inline void* Memalign(size_t align, size_t size) {
  //LOG(FATAL) << "memalign not supported on windows";
  exit(1);
  return nullptr;
}
static inline int PosixMemalign(void** ptr, size_t align, size_t size) {
  //LOG(FATAL) << "posix_memalign not supported on windows";
  exit(1);
  return -1;
}

// OS X defines posix_memalign in some OS versions but not others;
// it's confusing enough to check that it's easiest to just not to test.
#elif defined(__APPLE__)
static bool kOSSupportsMemalign = false;
static inline void* Memalign(size_t align, size_t size) {
  //LOG(FATAL) << "memalign not supported on OS X";
  exit(1);
  return nullptr;
}
static inline int PosixMemalign(void** ptr, size_t align, size_t size) {
  //LOG(FATAL) << "posix_memalign not supported on OS X";
  exit(1);
  return -1;
}

#else
static bool kOSSupportsMemalign = true;
static inline void* Memalign(size_t align, size_t size) {
  return noopt(memalign(align, noopt(size)));
}
static inline int PosixMemalign(void** ptr, size_t align, size_t size) {
  return noopt(posix_memalign(ptr, align, noopt(size)));
}

#endif

static constexpr size_t kOveralignment = 64;

struct overaligned_type
{
  alignas(kOveralignment)
  unsigned char data[kOveralignment * 2]; // make the object size different from
                                          // alignment to make sure the correct
                                          // values are passed to the new/delete
                                          // implementation functions
};

struct OOMAbleSysAlloc : public SysAllocator {
  SysAllocator *child;
  int simulate_oom;

  void* Alloc(size_t size, size_t* actual_size, size_t alignment) {
    if (simulate_oom) {
      return nullptr;
    }
    return child->Alloc(size, actual_size, alignment);
  }
};

static OOMAbleSysAlloc* get_test_sys_alloc() {
  static tcmalloc::StaticStorage<OOMAbleSysAlloc> storage;
  return storage.get();
}

void setup_oomable_sys_alloc() {
  SysAllocator *def = MallocExtension::instance()->GetSystemAllocator();

  OOMAbleSysAlloc *alloc = get_test_sys_alloc();
  new (alloc) OOMAbleSysAlloc;
  alloc->child = def;

  MallocExtension::instance()->SetSystemAllocator(alloc);
}

static const int FLAGS_numtests = 50000;
static const int FLAGS_log_every_n_tests = 50000; // log exactly once

// Testing parameters
static const int FLAGS_lgmaxsize = 16;   // lg() of the max size object to alloc
static const int FLAGS_numthreads = 10;  // Number of threads
static const int FLAGS_threadmb = 4;     // Max memory size allocated by thread
static const int FLAGS_lg_max_memalign = 18; // lg of max alignment for memalign

static const double FLAGS_memalign_min_fraction = 0;    // min expected%
static const double FLAGS_memalign_max_fraction = 0.4;  // max expected%
static const double FLAGS_memalign_max_alignment_ratio = 6;  // alignment/size

// Weights of different operations
static const int FLAGS_allocweight = 50;    // Weight for picking allocation
static const int FLAGS_freeweight = 50;     // Weight for picking free
static const int FLAGS_updateweight = 10;   // Weight for picking update
static const int FLAGS_passweight = 1;      // Weight for passing object

static const int kSizeBits = 8 * sizeof(size_t);
static const size_t kMaxSize = ~static_cast<size_t>(0);
static const size_t kMaxSignedSize = ((size_t(1) << (kSizeBits-1)) - 1);

static const size_t kNotTooBig = 100000;
// We want an allocation that is definitely more than main memory.  OS
// X has special logic to discard very big allocs before even passing
// the request along to the user-defined memory allocator; we're not
// interested in testing their logic, so we have to make sure we're
// not *too* big.
static const size_t kTooBig = kMaxSize - 100000;

// To help with generating random numbers
class TestHarness {
 private:
  // Information kept per type
  struct Type {
    std::string name;
    int         type;
    int         weight;
  };

 public:
  TestHarness(int seed) {
    srandom(seed);
  }

  // Add operation type with specified weight.  When starting a new
  // iteration, an operation type is picked with probability
  // proportional to its weight.
  //
  // "type" must be non-negative.
  // "weight" must be non-negative.
  void AddType(int type, int weight, const char* name);

  // Call this to get the type of operation for the next iteration.
  // It returns a random operation type from the set of registered
  // operations.  Returns -1 if tests should finish.
  int PickType();

  // If n == 0, returns the next pseudo-random number in the range [0 .. 0]
  // If n != 0, returns the next pseudo-random number in the range [0 .. n)
  int Uniform(int n) {
    if (n == 0) {
      return random() * 0;
    } else {
      return random() % n;
    }
  }
  // Pick "base" uniformly from range [0,max_log] and then return
  // "base" random bits.  The effect is to pick a number in the range
  // [0,2^max_log-1] with bias towards smaller numbers.
  int Skewed(int max_log) {
    const int base = random() % (max_log+1);
    return random() % (1 << base);
  }

 private:
  std::vector<Type>     types_;             // Registered types
  int                   total_weight_ = 0;  // Total weight of all types
  int                   num_tests_ = 0;     // Num tests run so far
};

void TestHarness::AddType(int type, int weight, const char* name) {
  Type t;
  t.name = name;
  t.type = type;
  t.weight = weight;
  types_.push_back(t);
  total_weight_ += weight;
}

int TestHarness::PickType() {
  if (num_tests_ >= FLAGS_numtests) return -1;
  num_tests_++;

  CHECK(total_weight_ > 0);
  // This is a little skewed if total_weight_ doesn't divide 2^31, but it's close
  int v = Uniform(total_weight_);
  int i;
  for (i = 0; i < types_.size(); i++) {
    v -= types_[i].weight;
    if (v < 0) {
      break;
    }
  }

  CHECK(i < types_.size());
  if ((num_tests_ % FLAGS_log_every_n_tests) == 0) {
    printf("  Test %d out of %d: %s\n",
            num_tests_, FLAGS_numtests, types_[i].name.c_str());
  }
  return types_[i].type;
}

class AllocatorState : public TestHarness {
 public:
  explicit AllocatorState(int seed) : TestHarness(seed), memalign_fraction_(0) {
    if (kOSSupportsMemalign) {
      CHECK_GE(FLAGS_memalign_max_fraction, 0);
      CHECK_LE(FLAGS_memalign_max_fraction, 1);
      CHECK_GE(FLAGS_memalign_min_fraction, 0);
      CHECK_LE(FLAGS_memalign_min_fraction, 1);
      double delta = FLAGS_memalign_max_fraction - FLAGS_memalign_min_fraction;
      CHECK_GE(delta, 0);
      memalign_fraction_ = (Uniform(10000)/10000.0 * delta +
                            FLAGS_memalign_min_fraction);
      //printf("memalign fraction: %f\n", memalign_fraction_);
    }
  }
  virtual ~AllocatorState() {}

  // Allocate memory.  Randomly choose between malloc() or posix_memalign().
  void* alloc(size_t size) {
    if (Uniform(100) < memalign_fraction_ * 100) {
      // Try a few times to find a reasonable alignment, or fall back on malloc.
      for (int i = 0; i < 5; i++) {
        size_t alignment = size_t{1} << Uniform(FLAGS_lg_max_memalign);
        if (alignment >= sizeof(intptr_t) &&
            (size < sizeof(intptr_t) ||
             alignment < FLAGS_memalign_max_alignment_ratio * size)) {
          void *result = reinterpret_cast<void*>(static_cast<intptr_t>(0x1234));
          int err = PosixMemalign(&result, alignment, size);
          if (err != 0) {
            CHECK_EQ(err, ENOMEM);
          }
          return err == 0 ? result : nullptr;
        }
      }
    }
    return noopt(malloc(size));
  }

 private:
  double memalign_fraction_;
};


// Info kept per thread
class TesterThread {
 private:
  // Info kept per allocated object
  struct Object {
    char*       ptr;                    // Allocated pointer
    int         size;                   // Allocated size
    int         generation;             // Generation counter of object contents
  };

  std::vector<std::unique_ptr<TesterThread>> &all_threads_;

  std::mutex            lock_;          // For passing in another thread's obj
  int                   id_;            // My thread id
  AllocatorState        rnd_;           // For generating random numbers
  std::vector<Object>   heap_;          // This thread's heap
  std::vector<Object>   passed_;        // Pending objects passed from others
  size_t                heap_size_;     // Current heap size

  // Type of operations
  enum Type { ALLOC, FREE, UPDATE, PASS };

  // ACM minimal standard random number generator.  (re-entrant.)
  class ACMRandom {
    int32_t seed_;
   public:
    explicit ACMRandom(int32_t seed) { seed_ = seed; }
    int32_t Next() {
      const int32_t M = 2147483647L;   // 2^31-1
      const int32_t A = 16807;
      // In effect, we are computing seed_ = (seed_ * A) % M, where M = 2^31-1
      uint32_t lo = A * (int32_t)(seed_ & 0xFFFF);
      uint32_t hi = A * (int32_t)((uint32_t)seed_ >> 16);
      lo += (hi & 0x7FFF) << 16;
      if (lo > M) {
        lo &= M;
        ++lo;
      }
      lo += hi >> 15;
      if (lo > M) {
        lo &= M;
        ++lo;
      }
      return (seed_ = (int32_t) lo);
    }
  };

 public:
  TesterThread(std::vector<std::unique_ptr<TesterThread>>& all_threads, int id)
    : all_threads_(all_threads),
      id_(id),
      rnd_(id+1),
      heap_size_(0) {
  }

  virtual ~TesterThread() {
  }

  virtual void Run() {
    rnd_.AddType(ALLOC,  FLAGS_allocweight,   "allocate");
    rnd_.AddType(FREE,   FLAGS_freeweight,    "free");
    rnd_.AddType(UPDATE, FLAGS_updateweight,  "update");
    rnd_.AddType(PASS,   FLAGS_passweight,    "pass");

    while (true) {
      AcquirePassedObjects();

      switch (rnd_.PickType()) {
        case ALLOC:   AllocateObject(); break;
        case FREE:    FreeObject();     break;
        case UPDATE:  UpdateObject();   break;
        case PASS:    PassObject();     break;
        case -1:      goto done;
        default:      CHECK(nullptr == "Unknown type");
      }

      ShrinkHeap();
    }

 done:
    DeleteHeap();
  }

  // Allocate a new object
  void AllocateObject() {
    Object object;
    object.size = rnd_.Skewed(FLAGS_lgmaxsize);
    object.ptr = static_cast<char*>(rnd_.alloc(object.size));
    CHECK(object.ptr);
    object.generation = 0;
    FillContents(&object);
    heap_.push_back(object);
    heap_size_ += object.size;
  }

  // Mutate a random object
  void UpdateObject() {
    if (heap_.empty()) return;
    const int index = rnd_.Uniform(heap_.size());
    CheckContents(heap_[index]);
    heap_[index].generation++;
    FillContents(&heap_[index]);
  }

  // Free a random object
  void FreeObject() {
    if (heap_.empty()) return;
    const int index = rnd_.Uniform(heap_.size());
    Object object = heap_[index];
    CheckContents(object);
    free(object.ptr);
    heap_size_ -= object.size;
    heap_[index] = heap_[heap_.size()-1];
    heap_.pop_back();
  }

  // Delete all objects in the heap
  void DeleteHeap() {
    while (!heap_.empty()) {
      FreeObject();
    }
  }

  // Free objects until our heap is small enough
  void ShrinkHeap() {
    while (heap_size_ > FLAGS_threadmb << 20) {
      CHECK(!heap_.empty());
      FreeObject();
    }
  }

  // Pass a random object to another thread
  void PassObject() {
    // Pick object to pass
    if (heap_.empty()) return;
    const int index = rnd_.Uniform(heap_.size());
    Object object = heap_[index];
    CheckContents(object);

    // Pick thread to pass
    const int tid = rnd_.Uniform(FLAGS_numthreads);
    TesterThread* thread = all_threads_[tid].get();

    if (thread->lock_.try_lock()) {
      // Pass the object
      thread->passed_.push_back(object);
      thread->lock_.unlock();
      heap_size_ -= object.size;
      heap_[index] = heap_[heap_.size()-1];
      heap_.pop_back();
    }
  }

  // Grab any objects passed to this thread by another thread
  void AcquirePassedObjects() {
    // We do not create unnecessary contention by always using
    // TryLock().  Plus we unlock immediately after swapping passed
    // objects into a local vector.
    std::vector<Object> copy;
    { // Locking scope
      if (!lock_.try_lock()) {
        return;
      }
      swap(copy, passed_);
      lock_.unlock();
    }

    for (int i = 0; i < copy.size(); ++i) {
      const Object& object = copy[i];
      CheckContents(object);
      heap_.push_back(object);
      heap_size_ += object.size;
    }
  }

  // Fill object contents according to ptr/generation
  void FillContents(Object* object) {
    ACMRandom r(reinterpret_cast<intptr_t>(object->ptr) & 0x7fffffff);
    for (int i = 0; i < object->generation; ++i) {
      r.Next();
    }
    const char c = static_cast<char>(r.Next());
    memset(object->ptr, c, object->size);
  }

  // Check object contents
  void CheckContents(const Object& object) {
    ACMRandom r(reinterpret_cast<intptr_t>(object.ptr) & 0x7fffffff);
    for (int i = 0; i < object.generation; ++i) {
      r.Next();
    }

    // For large objects, we just check a prefix/suffix
    const char expected = static_cast<char>(r.Next());
    const int limit1 = object.size < 32 ? object.size : 32;
    const int start2 = limit1 > object.size - 32 ? limit1 : object.size - 32;
    for (int i = 0; i < limit1; ++i) {
      CHECK_EQ(object.ptr[i], expected);
    }
    for (int i = start2; i < object.size; ++i) {
      CHECK_EQ(object.ptr[i], expected);
    }
  }
};

TEST(TCMallocTest, Versions) {
  auto build_version_string = [] (int major, int minor, const char* patch) -> std::string {
    CHECK(patch[0] == 0 || patch[0] == '.'); // patch version needs to start with dot
    std::stringstream ss;
    ss << "gperftools " << major << "." << minor << patch;
    return ss.str();
  };

  // We make sure that TC_VERSION_STRING define matches
  // TC_VERSION_MAJOR, TC_VERSION_MAJOR and TC_VERSION_PATCH (see
  // tcmalloc.h)
  std::string expected_version_string = build_version_string(TC_VERSION_MAJOR, TC_VERSION_MINOR, TC_VERSION_PATCH);
  ASSERT_EQ(expected_version_string, std::string(TC_VERSION_STRING));

  // autoconf's config.h has PACKAGE_VERSION that is taken from configure.ac
#if defined(PACKAGE_VERSION)
  // And we make sure that autoconf's idea of version matches what
  // we've manually put into tcmalloc.h
  ASSERT_EQ(expected_version_string, std::string("gperftools ") + PACKAGE_VERSION);
#else
  // Make sure we're able to exercise line above (we set this
  // environment variable in test runner)
  CHECK_EQ(getenv("GPERFTOOLS_ENSURE_PACKAGE_VERSION"), nullptr);
#endif
}

TEST(TCMallocTest, ManyThreads) {
  printf("Testing threaded allocation/deallocation (%d threads)\n",
          FLAGS_numthreads);

  std::vector<std::unique_ptr<TesterThread>> ptrs;
  ptrs.reserve(FLAGS_numthreads);
  // Note, the logic inside PassObject requires us to create all
  // TesterThreads first, before starting any of them.
  for (int i = 0; i < FLAGS_numthreads; i++) {
    ptrs.emplace_back(std::make_unique<TesterThread>(ptrs, i));
  }

  std::vector<std::thread> threads;
  threads.reserve(FLAGS_numthreads);
  for (int i = 0; i < FLAGS_numthreads; i++) {
    threads.emplace_back([thr = ptrs[i].get()] () {
      thr->Run();
    });
  }
  for (auto& t : threads) {
    t.join();
  }
}

static void TryHugeAllocation(size_t s, AllocatorState* rnd) {
  void* p = rnd->alloc(noopt(s));
  CHECK(p == nullptr);   // huge allocation s should fail!
}

static void TestHugeAllocations(AllocatorState* rnd) {
  // Check that asking for stuff tiny bit smaller than largest possible
  // size returns nullptr.
  for (size_t i = 0; i < 70000; i += rnd->Uniform(20)) {
    TryHugeAllocation(kMaxSize - i, rnd);
  }
  // Asking for memory sizes near signed/unsigned boundary (kMaxSignedSize)
  // might work or not, depending on the amount of virtual memory.
  if (!TestingPortal::Get()->IsDebuggingMalloc()) {
   // debug allocation takes forever for huge allocs
    for (size_t i = 0; i < 100; i++) {
      void* p = nullptr;
      p = rnd->alloc(kMaxSignedSize + i);
      if (p) free(p);    // if: free(nullptr) is not necessarily defined
      p = rnd->alloc(kMaxSignedSize - i);
      if (p) free(p);
    }
  }

  // Check that ReleaseFreeMemory has no visible effect (aka, does not
  // crash the test):
  MallocExtension* inst = MallocExtension::instance();
  CHECK(inst);
  inst->ReleaseFreeMemory();
}

static void TestCalloc(size_t n, size_t s, bool ok) {
  char* p = reinterpret_cast<char*>(noopt(calloc)(n, s));
  if (!ok) {
    CHECK(p == nullptr);  // calloc(n, s) should not succeed
  } else {
    CHECK(p != nullptr);  // calloc(n, s) should succeed
    for (int i = 0; i < n*s; i++) {
      CHECK(p[i] == '\0');
    }
    free(p);
  }
}

// This makes sure that reallocing a small number of bytes in either
// direction doesn't cause us to allocate new memory.
TEST(TCMallocTest, Realloc) {
  if (TestingPortal::Get()->IsDebuggingMalloc()) {
    // debug alloc doesn't try to minimize reallocs
    return;
  }
  // When sampling, we always allocate in units of page-size, which
  // makes reallocs of small sizes do extra work (thus, failing these
  // checks).  Since sampling is random, we turn off sampling to make
  // sure that doesn't happen to us here.

  // turn off sampling
  tcmalloc::Cleanup cleanup = SetFlag(&TestingPortal::Get()->GetSampleParameter(), 0);

  int start_sizes[] = { 100, 1000, 10000, 100000 };
  int deltas[] = { 1, -2, 4, -8, 16, -32, 64, -128 };

  for (int s = 0; s < sizeof(start_sizes)/sizeof(*start_sizes); ++s) {
    void* p = noopt(malloc(start_sizes[s]));
    ASSERT_NE(p, nullptr);
    // The larger the start-size, the larger the non-reallocing delta.
    for (int d = 0; d < (s+1) * 2; ++d) {
      void* new_p = noopt(realloc)(p, start_sizes[s] + deltas[d]);
      ASSERT_EQ(p, new_p);  // realloc should not allocate new memory
    }
    // Test again, but this time reallocing smaller first.
    for (int d = 0; d < s*2; ++d) {
      void* new_p = noopt(realloc)(p, start_sizes[s] - deltas[d]);
      ASSERT_EQ(p, new_p);  // realloc should not allocate new memory
    }
    free(p);
  }
}

#if __cpp_exceptions
static int news_handled = 0;

static void TestNewHandler() {
  ++news_handled;
  throw std::bad_alloc();
}

static void TestOneNew(void* (*func)(size_t)) {
  func = noopt(func);
  // success test
  try {
    void* ptr = (*func)(kNotTooBig);
    if (0 == ptr) {
      printf("allocation should not have failed.\n");
      abort();
    }
  } catch (...) {
    printf("allocation threw unexpected exception.\n");
    abort();
  }

  // failure test
  // we should always receive a bad_alloc exception
  try {
    (*func)(kTooBig);
    printf("allocation should have failed.\n");
    abort();
  } catch (const std::bad_alloc&) {
    // correct
  } catch (...) {
    printf("allocation threw unexpected exception.\n");
    abort();
  }
}

static void TestNew(void* (*func)(size_t)) {
  news_handled = 0;

  // test without new_handler:
  std::new_handler saved_handler = std::set_new_handler(0);
  TestOneNew(func);

  // test with new_handler:
  std::set_new_handler(TestNewHandler);
  TestOneNew(func);
  if (news_handled != 1) {
    printf("new_handler was not called.\n");
    abort();
  }
  std::set_new_handler(saved_handler);
}

static void TestOneNothrowNew(void* (*func)(size_t, const std::nothrow_t&)) {
  func = noopt(func);
  // success test
  try {
    void* ptr = (*func)(kNotTooBig, std::nothrow);
    if (ptr == nullptr) {
      printf("allocation should not have failed.\n");
      abort();
    }
  } catch (...) {
    printf("allocation threw unexpected exception.\n");
    abort();
  }

  // failure test
  // we should always receive a bad_alloc exception
  try {
    if ((*func)(kTooBig, std::nothrow) != 0) {
      printf("allocation should have failed.\n");
      abort();
    }
  } catch (...) {
    printf("nothrow allocation threw unexpected exception.\n");
    abort();
  }
}

static void TestNothrowNew(void* (*func)(size_t, const std::nothrow_t&)) {
  news_handled = 0;

  // test without new_handler:
  std::new_handler saved_handler = std::set_new_handler(0);
  TestOneNothrowNew(func);

  // test with new_handler:
  std::set_new_handler(TestNewHandler);
  TestOneNothrowNew(func);
  if (news_handled != 1) {
    printf("nothrow new_handler was not called.\n");
    abort();
  }
  std::set_new_handler(saved_handler);
}

TEST(TCMallocTest, OperatorsNewOOMs) {
  printf("Testing operator new(nothrow).\n");
  TestNothrowNew(&::operator new);
  printf("Testing operator new[](nothrow).\n");
  TestNothrowNew(&::operator new[]);
  printf("Testing operator new.\n");
  TestNew(&::operator new);
  printf("Testing operator new[].\n");
  TestNew(&::operator new[]);
}

#endif  // __cpp_exceptions


// These are used as callbacks by the sanity-check.  Set* and Reset*
// register the hook that counts how many times the associated memory
// function is called.  After each such call, call Verify* to verify
// that we used the tcmalloc version of the call, and not the libc.
// Note the ... in the hook signature: we don't care what arguments
// the hook takes.
#define MAKE_HOOK_CALLBACK(hook_type, ...)                              \
  static volatile int g_##hook_type##_calls = 0;                                 \
  static void IncrementCallsTo##hook_type(__VA_ARGS__) {                \
    g_##hook_type##_calls++;                                            \
  }                                                                     \
  static void Verify##hook_type##WasCalled() {                          \
    CHECK_GT(g_##hook_type##_calls, 0);                                 \
    g_##hook_type##_calls = 0;  /* reset for next call */               \
  }                                                                     \
  static void Set##hook_type() {                                        \
    CHECK(MallocHook::Add##hook_type(                                   \
        (MallocHook::hook_type)&IncrementCallsTo##hook_type));          \
  }                                                                     \
  static void Reset##hook_type() {                                      \
    g_##hook_type##_calls = 0;                                          \
    CHECK(MallocHook::Remove##hook_type(                                \
        (MallocHook::hook_type)&IncrementCallsTo##hook_type));          \
  }

// We do one for each hook typedef in malloc_hook.h
MAKE_HOOK_CALLBACK(NewHook, const void*, size_t);
MAKE_HOOK_CALLBACK(DeleteHook, const void*);

static void TestAlignmentForSize(int size) {
  const size_t min_align = TestingPortal::Get()->GetMinAlign();

  printf("Testing alignment of malloc(%d)\n", size);
  static const int kNum = 100;
  void* ptrs[kNum];
  for (int i = 0; i < kNum; i++) {
    ptrs[i] = malloc(size);
    uintptr_t p = reinterpret_cast<uintptr_t>(ptrs[i]);
    CHECK((p % sizeof(void*)) == 0);
    CHECK((p % sizeof(double)) == 0);

    // Must have 16-byte (or 8-byte in case of -DTCMALLOC_ALIGN_8BYTES)
    // alignment for large enough objects
    if (size >= min_align) {
      CHECK((p % min_align) == 0);
    }
  }
  for (int i = 0; i < kNum; i++) {
    free(ptrs[i]);
  }
}

TEST(TCMallocTest, MallocAlignment) {
  for (int lg = 0; lg < 16; lg++) {
    TestAlignmentForSize((1<<lg) - 1);
    TestAlignmentForSize((1<<lg) + 0);
    TestAlignmentForSize((1<<lg) + 1);
  }
}

TEST(TCMallocTest, HugeThreadCache) {
  printf("==== Testing huge thread cache\n");
  // More than 2^16 to cause integer overflow of 16 bit counters.
  static const int kNum = 70000;
  char** array = new char*[kNum];
  for (int i = 0; i < kNum; ++i) {
    array[i] = new char[10];
  }
  for (int i = 0; i < kNum; ++i) {
    delete[] array[i];
  }
  delete[] array;
}

// Check that at least one of the callbacks from Ranges() contains
// the specified address with the specified type, and has size
// >= min_size.
static void CheckRangeCallback(void* ptr, base::MallocRange::Type type,
                               size_t min_size) {
  bool matched = false;
  const uintptr_t addr = reinterpret_cast<uintptr_t>(ptr);
  auto callback = [&] (const base::MallocRange* r) -> void {
    if (!(r->address <= addr && addr < r->address + r->length)) {
      return;
    }

    if (type == base::MallocRange::FREE) {
      // We are expecting r->type == FREE, but ReleaseMemory
      // may have already moved us to UNMAPPED state instead (this happens in
      // approximately 0.1% of executions). Accept either state.
      CHECK(r->type == base::MallocRange::FREE ||
            r->type == base::MallocRange::UNMAPPED);
    } else {
      CHECK_EQ(r->type, type);
    }
    CHECK_GE(r->length, min_size);

    matched = true;
  };

  tcmalloc::FunctionRefFirstDataArg<void(const base::MallocRange*)> ref(callback);
  MallocExtension::instance()->Ranges(ref.data, ref.fn);
  EXPECT_TRUE(matched);
}

TEST(TCMallocTest, Ranges) {
  static const int MB = 1048576;
  void* a = malloc(MB);
  void* b = malloc(MB);
  base::MallocRange::Type releasedType =
    TestingPortal::Get()->HaveSystemRelease() ? base::MallocRange::UNMAPPED : base::MallocRange::FREE;

  CheckRangeCallback(a, base::MallocRange::INUSE, MB);
  CheckRangeCallback(b, base::MallocRange::INUSE, MB);

  (noopt(free))(a);

  CheckRangeCallback(a, base::MallocRange::FREE, MB);
  CheckRangeCallback(b, base::MallocRange::INUSE, MB);

  MallocExtension::instance()->ReleaseFreeMemory();

  CheckRangeCallback(a, releasedType, MB);
  CheckRangeCallback(b, base::MallocRange::INUSE, MB);

  (noopt(free))(b);

  CheckRangeCallback(a, releasedType, MB);
  CheckRangeCallback(b, base::MallocRange::FREE, MB);
}

static size_t GetUnmappedBytes() {
  size_t bytes;
  CHECK(MallocExtension::instance()->GetNumericProperty(
          "tcmalloc.pageheap_unmapped_bytes", &bytes));
  return bytes;
}

TEST(TCMallocTest, ReleaseToSystem) {
  // Debug allocation mode adds overhead to each allocation which
  // messes up all the equality tests here.  I just disable the
  // test in this mode.
  if (TestingPortal::Get()->IsDebuggingMalloc()) {
    return;
  }

  if(!TestingPortal::Get()->HaveSystemRelease()) return;

  tcmalloc::Cleanup release_rate_cleanup = SetFlag(&TestingPortal::Get()->GetReleaseRate(), 0);
  tcmalloc::Cleanup decommit_cleanup = kAggressiveDecommit.Override(0);

  static const int MB = 1048576;
  void* a = noopt(malloc(MB));
  void* b = noopt(malloc(MB));
  MallocExtension::instance()->ReleaseFreeMemory();
  size_t starting_bytes = GetUnmappedBytes();

  // Calling ReleaseFreeMemory() a second time shouldn't do anything.
  MallocExtension::instance()->ReleaseFreeMemory();
  EXPECT_EQ(starting_bytes, GetUnmappedBytes());

  // ReleaseToSystem shouldn't do anything either.
  MallocExtension::instance()->ReleaseToSystem(MB);
  EXPECT_EQ(starting_bytes, GetUnmappedBytes());

  free(a);

  // The span to release should be 1MB.
  MallocExtension::instance()->ReleaseToSystem(MB/2);
  EXPECT_EQ(starting_bytes + MB, GetUnmappedBytes());

  // Should do nothing since the previous call released too much.
  MallocExtension::instance()->ReleaseToSystem(MB/4);
  EXPECT_EQ(starting_bytes + MB, GetUnmappedBytes());

  free(b);

  // Use up the extra MB/4 bytes from 'a' and also release 'b'.
  MallocExtension::instance()->ReleaseToSystem(MB/2);
  EXPECT_EQ(starting_bytes + 2*MB, GetUnmappedBytes());

  // Should do nothing since the previous call released too much.
  MallocExtension::instance()->ReleaseToSystem(MB/2);
  EXPECT_EQ(starting_bytes + 2*MB, GetUnmappedBytes());

  // Nothing else to release.
  MallocExtension::instance()->ReleaseFreeMemory();
  EXPECT_EQ(starting_bytes + 2*MB, GetUnmappedBytes());

  a = noopt(malloc(MB));
  free(a);
  EXPECT_EQ(starting_bytes + MB, GetUnmappedBytes());

  // Releasing less than a page should still trigger a release.
  MallocExtension::instance()->ReleaseToSystem(1);
  EXPECT_EQ(starting_bytes + 2*MB, GetUnmappedBytes());
}

TEST(TCMallocTest, LargeAllocsRelease) {
  // Debug allocation mode adds overhead to each allocation which
  // messes up all the equality tests here.  I just disable the
  // test in this mode.
  if (TestingPortal::Get()->IsDebuggingMalloc()) {
    return;
  }

  if(!TestingPortal::Get()->HaveSystemRelease()) return;

  tcmalloc::Cleanup release_rate_cleanup = SetFlag(&TestingPortal::Get()->GetReleaseRate(), 0);
  tcmalloc::Cleanup decommit_cleanup = kAggressiveDecommit.Override(0);

  // This test verifies special logic where page heap prefers reusing
  // normal spans over touching returned spans for large allocations
  // where spans are of the same size.
  //
  // We have the same logic for non-large spans.
  //
  // See github pull request
  // https://github.com/gperftools/gperftools/pull/1604 and commit
  // 32f11cb4b777880f7ecff3edcb5bc04fd6f1dff1 for motivation.

  constexpr size_t kNumPtrs = 10;
  constexpr size_t kBigAllocBytes = 3 << 20;

  std::vector<std::unique_ptr<char[]>> cleanup;
  std::vector<std::unique_ptr<char[]>> chunks;

  auto alloc_big = [&] () -> std::unique_ptr<char[]> {
    return std::unique_ptr<char[]>{noopt<char*>(new char[kBigAllocBytes])};
  };

  for (;;) {
    // Ensure there is big large chunk of memory that is available. We
    // want kNumPtrs * 2 successive chunks to be allocated in this
    // space. This test is explicitly very picky in what behavior it
    // triggers.
    free(noopt(malloc(kNumPtrs * 2 * kBigAllocBytes)));

    size_t i;
    for (i = 0; i < kNumPtrs * 2; i++) {
      chunks.emplace_back(alloc_big());
      if (i > 0) {
        if (chunks.rbegin()->get() != (chunks.rbegin()+1)->get() + kBigAllocBytes) {
          static int num_fail;
          printf("successive allocation failure %d. Will retry\n", ++num_fail);
          ASSERT_LE(num_fail, 32);
          break;
        }
      }
    }
    if (i == kNumPtrs * 2) {
      break; // success
    }

    // Whatever we've got so far, lets ensure it is cleaned up. But after the test.
    std::move(chunks.begin(), chunks.end(), std::back_inserter(cleanup));
    chunks.clear();
  }

  std::array<std::unique_ptr<char[]>, kNumPtrs> used_ptrs;
  std::array<std::unique_ptr<char[]>, kNumPtrs> free_ptrs;

  for (size_t i = 0; i < kNumPtrs; ++i) {
    // interleave used_ptrs and free_ptrs to prevent free_ptrs from coalescing
    used_ptrs[i] = std::move(chunks[i * 2]);
    free_ptrs[i] = std::move(chunks[i * 2 + 1]);
  }

  MallocExtension::instance()->ReleaseFreeMemory();

  size_t starting_bytes = GetUnmappedBytes();

  for (auto& ptr : free_ptrs) {
    ptr.reset();
  }
  // Ensure that free-s just above did not cause any returns of memory
  // to the kernel.
  EXPECT_EQ(starting_bytes, GetUnmappedBytes());

  // Here is the logic. So we're at the stage where only normal spans
  // are from free-s (unique_ptr resets) just above. And there is some
  // number of returned spans. As we call ReleaseToSystem with the
  // exact span size, we will return one of those to the kernel and
  // move the span to returned list.
  for (size_t i = 0; i < 2 * kNumPtrs; ++i) {
    MallocExtension::instance()->ReleaseToSystem(kBigAllocBytes);
    // Then we expect the following allocation to take one of those
    // normal spans (despite just returned span to have lower address).
    //
    // I.e. we want to avoid allocating the memory we just returned to
    // the kernel. Which would grow RSS unnecessarily.
    auto a = alloc_big();
    a.reset();
  }
  MallocExtension::instance()->ReleaseToSystem(kBigAllocBytes);
  // And finally we ensure that, indeed, we've returned all the chunks
  // we've freed.
  EXPECT_EQ(starting_bytes + kNumPtrs * kBigAllocBytes, GetUnmappedBytes());
}

TEST(TCMallocTest, AggressiveDecommit) {
  // Debug allocation mode adds overhead to each allocation which
  // messes up all the equality tests here.  I just disable the
  // teset in this mode.
  if(TestingPortal::Get()->IsDebuggingMalloc() || !TestingPortal::Get()->HaveSystemRelease()) {
    return;
  }

  printf("Testing aggressive de-commit\n");

  MallocExtension::instance()->ReleaseFreeMemory();

  tcmalloc::Cleanup cleanup = kAggressiveDecommit.Override(1);

  static const int MB = 1048576;
  void* a = noopt(malloc(MB));
  void* b = noopt(malloc(MB));

  size_t starting_bytes = GetUnmappedBytes();

  // ReleaseToSystem shouldn't do anything either.
  MallocExtension::instance()->ReleaseToSystem(MB);
  EXPECT_EQ(starting_bytes, GetUnmappedBytes());

  free(a);

  // The span to release should be 1MB.
  EXPECT_EQ(starting_bytes + MB, GetUnmappedBytes());

  free(b);

  EXPECT_EQ(starting_bytes + 2*MB, GetUnmappedBytes());

  // Nothing else to release.
  MallocExtension::instance()->ReleaseFreeMemory();
  EXPECT_EQ(starting_bytes + 2*MB, GetUnmappedBytes());

  a = noopt(malloc(MB));
  free(a);

  EXPECT_EQ(starting_bytes + 2*MB, GetUnmappedBytes());

  printf("Done testing aggressive de-commit\n");
}

// On MSVC10, in release mode, the optimizer convinces itself
// g_no_memory is never changed (I guess it doesn't realize OnNoMemory
// might be called).  Work around this by setting the var volatile.
volatile bool g_no_memory;
std::new_handler g_old_handler;
static void OnNoMemory() {
  g_no_memory = true;
  std::set_new_handler(g_old_handler);
}

TEST(TCMallocTest, SetNewMode) {
  int old_mode = tc_set_new_mode(1);

  g_old_handler = std::set_new_handler(&OnNoMemory);
  g_no_memory = false;
  void* ret = noopt(malloc(noopt(kTooBig)));
  EXPECT_EQ(nullptr, ret);
  EXPECT_TRUE(g_no_memory);

  g_old_handler = std::set_new_handler(&OnNoMemory);
  g_no_memory = false;
  ret = noopt(calloc(1, noopt(kTooBig)));
  EXPECT_EQ(nullptr, ret);
  EXPECT_TRUE(g_no_memory);

  g_old_handler = std::set_new_handler(&OnNoMemory);
  g_no_memory = false;
  ret = noopt(realloc(nullptr, noopt(kTooBig)));
  EXPECT_EQ(nullptr, ret);
  EXPECT_TRUE(g_no_memory);

  if (kOSSupportsMemalign) {
    // Not really important, but must be small enough such that
    // kAlignment + kTooBig does not overflow.
    const int kAlignment = 1 << 5;

    g_old_handler = std::set_new_handler(&OnNoMemory);
    g_no_memory = false;
    ret = Memalign(kAlignment, kTooBig);
    EXPECT_EQ(nullptr, ret);
    EXPECT_TRUE(g_no_memory);

    g_old_handler = std::set_new_handler(&OnNoMemory);
    g_no_memory = false;
    EXPECT_EQ(ENOMEM,
              PosixMemalign(&ret, kAlignment, kTooBig));
    EXPECT_EQ(nullptr, ret);
    EXPECT_TRUE(g_no_memory);
  }

  tc_set_new_mode(old_mode);
}

TEST(TCMallocTest, TestErrno) {
  void* ret;
  if (kOSSupportsMemalign) {
    errno = 0;
    ret = Memalign(128, kTooBig);
    EXPECT_EQ(nullptr, ret);
    EXPECT_EQ(ENOMEM, errno);
  }

  errno = 0;
  ret = noopt(malloc(noopt(kTooBig)));
  EXPECT_EQ(nullptr, ret);
  EXPECT_EQ(ENOMEM, errno);

  errno = 0;
  ret = tc_malloc_skip_new_handler(kTooBig);
  EXPECT_EQ(nullptr, ret);
  EXPECT_EQ(ENOMEM, errno);
}

// Ensure that nallocx works before main.
struct GlobalNallocx {
  GlobalNallocx() {
    if (!TestingPortal::Get()->IsDebuggingMalloc()) {
      CHECK_GT(nallocx(99, 0), 99);
    }
  }
} global_nallocx;

#if defined(__GNUC__)

static void check_global_nallocx() __attribute__((constructor));
static void check_global_nallocx() {
  if (TestingPortal::Get()->IsDebuggingMalloc()) {
    return;
  }

  CHECK_GT(nallocx(99, 0), 99);
}

#endif // __GNUC__

static size_t GrowNallocxTestSize(size_t sz) {
  if (sz < 1024) {
    return sz + 7;
  }

  size_t divided = sz >> 7;
  divided |= (divided >> 1);
  divided |= (divided >> 2);
  divided |= (divided >> 4);
  divided |= (divided >> 8);
  divided |= (divided >> 16);
  divided += 1;
  return sz + divided;
}

TEST(TCMallocTest, NAllocX) {
  if (TestingPortal::Get()->IsDebuggingMalloc()) {
    return;
  }

  for (size_t size = 0; size <= (1 << 20); size = GrowNallocxTestSize(size)) {
    size_t rounded = nallocx(size, 0);
    ASSERT_GE(rounded, size);
    void* ptr = malloc(size);
    ASSERT_EQ(rounded, MallocExtension::instance()->GetAllocatedSize(ptr));
    free(ptr);
  }
}

TEST(TCMallocTest, NAllocXAlignment) {
  if (TestingPortal::Get()->IsDebuggingMalloc()) {
    return;
  }

  for (size_t size = 0; size <= (1 << 20); size = GrowNallocxTestSize(size)) {
    for (size_t align_log = 0; align_log < 10; align_log++) {
      size_t rounded = nallocx(size, MALLOCX_LG_ALIGN(align_log));
      size_t align = size_t{1} << align_log;
      ASSERT_GE(rounded, size);
      ASSERT_EQ(rounded % align, 0);
      void* ptr = tc_memalign(align, size);
      ASSERT_EQ(rounded, MallocExtension::instance()->GetAllocatedSize(ptr));
      free(ptr);
    }
  }
}

struct NewHandlerHelper {
  NewHandlerHelper(NewHandlerHelper* prev) : prev(prev) {
    memset(filler, 0, sizeof(filler));
  }

  NewHandlerHelper* Pop() {
    NewHandlerHelper* prev = this->prev;
    delete this;
    return prev;
  }

  NewHandlerHelper* const prev;
  char filler[512];
};

static int saw_new_handler_runs;
static NewHandlerHelper* oom_test_last_ptr;

static void test_new_handler() {
  oom_test_last_ptr = oom_test_last_ptr->Pop();
  saw_new_handler_runs++;
}

TEST(TCMallocTest, NewHandler) {
  if (running_fork_testing) return;

  // debug allocator does internal allocations and crashes when such
  // internal allocation fails. So don't test it.
  if (TestingPortal::Get()->IsDebuggingMalloc()) {
    return;
  }

  ASSERT_EQ(oom_test_last_ptr, nullptr);
  ASSERT_EQ(saw_new_handler_runs, 0);
  tcmalloc::Cleanup clean_oom_testers([] () {
    while (oom_test_last_ptr) {
      oom_test_last_ptr = oom_test_last_ptr->Pop();
    }
  });

  setup_oomable_sys_alloc();

  std::new_handler old = std::set_new_handler(test_new_handler);
  get_test_sys_alloc()->simulate_oom = true;
  tcmalloc::Cleanup restore_oom([] () {
    get_test_sys_alloc()->simulate_oom = false;
  });

  ASSERT_EQ(saw_new_handler_runs, 0);

  // After we enabled "simulate oom" behavior in sys allocator, we may
  // need to allocate a lot of NewHandlerHelper instances until all
  // the page heap free reserves are consumed and we're hitting
  // sysallocator. So we have a linked list of thoses and keep
  // allocating until we see our test_new_handler runs.
  //
  // Note, there is also slight chance that we'll hit crash while
  // failing to allocate internal metadata. It doesn't happen often
  // (and not with default order of tests), but something we'll need
  // to fix one day.
  for (int i = 1<<24; i > 0; i--) {
    oom_test_last_ptr = noopt(new NewHandlerHelper(oom_test_last_ptr));
    ASSERT_NE(oom_test_last_ptr, nullptr);
    if (saw_new_handler_runs) {
      break;
    }
  }

  ASSERT_EQ(saw_new_handler_runs, 1);

  std::set_new_handler(old);
}

TEST(TCMallocTest, AllTests) {
  AllocatorState rnd(100);

  // Check that empty allocation works
  printf("Testing empty allocation\n");
  {
    void* p1 = rnd.alloc(0);
    ASSERT_NE(p1, nullptr);
    void* p2 = rnd.alloc(0);
    ASSERT_NE(p2, nullptr);
    ASSERT_NE(p1, p2);
    free(p1);
    free(p2);
  }

  // This code stresses some of the memory allocation via STL.
  // It may call operator delete(void*, nothrow_t).
  printf("Testing STL use\n");
  {
    std::vector<int> v;
    v.push_back(1);
    v.push_back(2);
    v.push_back(3);
    v.push_back(0);
    std::stable_sort(v.begin(), v.end());
  }

#ifdef ENABLE_SIZED_DELETE
  {
    printf("Testing large sized delete is not crashing\n");
    // Large sized delete
    // case. https://github.com/gperftools/gperftools/issues/1254
    std::vector<char*> addresses;
    constexpr int kSizedDepth = 1024;
    addresses.reserve(kSizedDepth);
    for (int i = 0; i < kSizedDepth; i++) {
      addresses.push_back(noopt(new char[12686]));
    }
    for (int i = 0; i < kSizedDepth; i++) {
      ::operator delete[](addresses[i], 12686);
    }
  }
#endif

  // Test each of the memory-allocation functions once, just as a sanity-check
  printf("Sanity-testing all the memory allocation functions\n");
  {
    // We use new-hook and delete-hook to verify we actually called the
    // tcmalloc version of these routines, and not the libc version.
    SetNewHook();      // defined as part of MAKE_HOOK_CALLBACK, above
    SetDeleteHook();   // ditto
    tcmalloc::Cleanup unhook([] () {
      // Reset the hooks to what they used to be.  These are all
      // defined as part of MAKE_HOOK_CALLBACK, above.
      ResetNewHook();
      ResetDeleteHook();
    });

    void* p1 = noopt(malloc)(10);
    ASSERT_NE(p1, nullptr);    // force use of this variable
    VerifyNewHookWasCalled();
    // Also test the non-standard tc_malloc_size
    size_t actual_p1_size = tc_malloc_size(p1);
    ASSERT_GE(actual_p1_size, 10);
    ASSERT_LT(actual_p1_size, 100000);   // a reasonable upper-bound, I think
    free(p1);
    VerifyDeleteHookWasCalled();

    p1 = noopt(malloc)(10);
    ASSERT_NE(p1, nullptr);
    VerifyNewHookWasCalled();
    tc_free_sized(p1, 10);
    VerifyDeleteHookWasCalled();

    // sadly windows stuff lacks aligned_alloc
    // (https://learn.microsoft.com/en-us/cpp/standard-library/cstdlib?view=msvc-170#remarks-6)
    p1 = noopt(tc_memalign)(1, 10);
    ASSERT_NE(p1, nullptr);
    VerifyNewHookWasCalled();
    tc_free_aligned_sized(p1, 1, 10);
    VerifyDeleteHookWasCalled();

    p1 = tc_malloc_skip_new_handler(10);
    ASSERT_NE(p1, nullptr);
    VerifyNewHookWasCalled();
    free(p1);
    VerifyDeleteHookWasCalled();

    p1 = noopt(calloc)(10, 2);
    ASSERT_NE(p1, nullptr);
    VerifyNewHookWasCalled();
    // We make sure we realloc to a big size, since some systems (OS
    // X) will notice if the realloced size continues to fit into the
    // malloc-block and make this a noop if so.
    p1 = noopt(realloc)(p1, 30000);
    ASSERT_NE(p1, nullptr);
    VerifyNewHookWasCalled();
    VerifyDeleteHookWasCalled();
    free(p1);
    VerifyDeleteHookWasCalled();

    if (kOSSupportsMemalign) {
      ASSERT_EQ(noopt(PosixMemalign)(&p1, sizeof(p1), 40), 0);
      ASSERT_NE(p1, nullptr);
      VerifyNewHookWasCalled();
      free(p1);
      VerifyDeleteHookWasCalled();

      p1 = noopt(Memalign)(sizeof(p1) * 2, 50);
      ASSERT_NE(p1, nullptr);
      VerifyNewHookWasCalled();
      free(p1);
      VerifyDeleteHookWasCalled();
    }

    // Windows has _aligned_malloc.  Let's test that that's captured too.
#if (defined(_MSC_VER) || defined(__MINGW32__)) && !defined(PERFTOOLS_NO_ALIGNED_MALLOC)
    p1 = noopt(_aligned_malloc)(sizeof(p1) * 2, 64);
    ASSERT_NE(p1, nullptr);
    VerifyNewHookWasCalled();
    _aligned_free(p1);
    VerifyDeleteHookWasCalled();
#endif

    p1 = noopt(valloc(60));
    ASSERT_NE(p1, nullptr);
    VerifyNewHookWasCalled();
    free(p1);
    VerifyDeleteHookWasCalled();

    p1 = noopt(pvalloc(70));
    ASSERT_NE(p1, nullptr);
    VerifyNewHookWasCalled();
    free(p1);
    VerifyDeleteHookWasCalled();

    char* p2 = noopt(new char);
    ASSERT_NE(p2, nullptr);
    VerifyNewHookWasCalled();
    delete p2;
    VerifyDeleteHookWasCalled();

    p2 = noopt(new char[100]);
    ASSERT_NE(p2, nullptr);
    VerifyNewHookWasCalled();
    delete[] p2;
    VerifyDeleteHookWasCalled();

    p2 = noopt(new (std::nothrow) char);
    ASSERT_NE(p2, nullptr);
    VerifyNewHookWasCalled();
    delete p2;
    VerifyDeleteHookWasCalled();

    p2 = noopt(new (std::nothrow) char[100]);
    ASSERT_NE(p2, nullptr);
    VerifyNewHookWasCalled();
    delete[] p2;
    VerifyDeleteHookWasCalled();

    // Another way of calling operator new
    p2 = noopt(static_cast<char*>(::operator new(100)));
    ASSERT_NE(p2, nullptr);
    VerifyNewHookWasCalled();
    ::operator delete(p2);
    VerifyDeleteHookWasCalled();

    // Try to call nothrow's delete too.  Compilers use this.
    p2 = noopt(static_cast<char*>(::operator new(100, std::nothrow)));
    ASSERT_NE(p2, nullptr);
    VerifyNewHookWasCalled();
    ::operator delete(p2, std::nothrow);
    VerifyDeleteHookWasCalled();

#ifdef ENABLE_SIZED_DELETE
    p2 = noopt(new char);
    ASSERT_NE(p2, nullptr);
    VerifyNewHookWasCalled();
    ::operator delete(p2, sizeof(char));
    VerifyDeleteHookWasCalled();

    p2 = noopt(new char[100]);
    ASSERT_NE(p2, nullptr);
    VerifyNewHookWasCalled();
    ::operator delete[](p2, sizeof(char) * 100);
    VerifyDeleteHookWasCalled();
#endif

    overaligned_type* poveraligned = noopt(new overaligned_type);
    ASSERT_NE(poveraligned, nullptr);
    ASSERT_EQ((((size_t)poveraligned) % kOveralignment), 0);
    VerifyNewHookWasCalled();
    delete poveraligned;
    VerifyDeleteHookWasCalled();

    poveraligned = noopt(new overaligned_type[10]);
    ASSERT_NE(poveraligned, nullptr);
    ASSERT_EQ((((size_t)poveraligned) % kOveralignment), 0);
    VerifyNewHookWasCalled();
    delete[] poveraligned;
    VerifyDeleteHookWasCalled();

    poveraligned = noopt(new(std::nothrow) overaligned_type);
    ASSERT_NE(poveraligned, nullptr);
    ASSERT_EQ((((size_t)poveraligned) % kOveralignment), 0);
    VerifyNewHookWasCalled();
    delete poveraligned;
    VerifyDeleteHookWasCalled();

    poveraligned = noopt(new(std::nothrow) overaligned_type[10]);
    ASSERT_NE(poveraligned, nullptr);
    ASSERT_EQ((((size_t)poveraligned) % kOveralignment), 0);
    VerifyNewHookWasCalled();
    delete[] poveraligned;
    VerifyDeleteHookWasCalled();

    // Another way of calling operator new
    p2 = noopt(static_cast<char*>(::operator new(100, std::align_val_t(kOveralignment))));
    ASSERT_NE(p2, nullptr);
    ASSERT_EQ((((size_t)p2) % kOveralignment), 0);
    VerifyNewHookWasCalled();
    ::operator delete(p2, std::align_val_t(kOveralignment));
    VerifyDeleteHookWasCalled();

    p2 = noopt(static_cast<char*>(::operator new(100, std::align_val_t(kOveralignment), std::nothrow)));
    ASSERT_NE(p2, nullptr);
    ASSERT_EQ((((size_t)p2) % kOveralignment), 0);
    VerifyNewHookWasCalled();
    ::operator delete(p2, std::align_val_t(kOveralignment), std::nothrow);
    VerifyDeleteHookWasCalled();

    poveraligned = noopt(new overaligned_type);
    ASSERT_NE(poveraligned, nullptr);
    ASSERT_EQ((((size_t)poveraligned) % kOveralignment), 0);
    VerifyNewHookWasCalled();
    ::operator delete(poveraligned, sizeof(overaligned_type), std::align_val_t(kOveralignment));
    VerifyDeleteHookWasCalled();

    poveraligned = noopt(new overaligned_type[10]);
    ASSERT_NE(poveraligned, nullptr);
    ASSERT_EQ((((size_t)poveraligned) % kOveralignment), 0);
    VerifyNewHookWasCalled();
    ::operator delete[](poveraligned, sizeof(overaligned_type) * 10, std::align_val_t(kOveralignment));
    VerifyDeleteHookWasCalled();

// On AIX user defined malloc replacement of libc routines
// cannot be done at link time must be done a runtime via
// environment variable MALLOCTYPE
#if !defined(_AIX)
    // Try strdup(), which the system allocates but we must free.  If
    // all goes well, libc will use our malloc!
    p2 = noopt(strdup("in memory of James Golick"));
    ASSERT_NE(p2, nullptr);
    VerifyNewHookWasCalled();
    free(p2);
    VerifyDeleteHookWasCalled();
#endif
  }

  // Check that "lots" of memory can be allocated
  printf("Testing large allocation\n");
  {
    const int mb_to_allocate = 100;
    void* p = rnd.alloc(mb_to_allocate << 20);
    ASSERT_NE(p, nullptr);  // could not allocate
    free(p);
  }

  // Check calloc() with various arguments
  printf("Testing calloc\n");
  TestCalloc(0, 0, true);
  TestCalloc(0, 1, true);
  TestCalloc(1, 1, true);
  TestCalloc(1<<10, 0, true);
  TestCalloc(1<<20, 0, true);
  TestCalloc(0, 1<<10, true);
  TestCalloc(0, 1<<20, true);
  TestCalloc(1<<20, 2, true);
  TestCalloc(2, 1<<20, true);
  TestCalloc(1000, 1000, true);

  TestCalloc(kMaxSize, 2, false);
  TestCalloc(2, kMaxSize, false);
  TestCalloc(kMaxSize, kMaxSize, false);

  TestCalloc(kMaxSignedSize, 3, false);
  TestCalloc(3, kMaxSignedSize, false);
  TestCalloc(kMaxSignedSize, kMaxSignedSize, false);

  // Do the memory intensive tests after threads are done, since exhausting
  // the available address space can make pthread_create to fail.

  // Check that huge allocations fail with nullptr instead of crashing
  printf("Testing huge allocations\n");
  TestHugeAllocations(&rnd);

  // Check that large allocations fail with nullptr instead of crashing
  //
  // debug allocation takes forever for huge allocs
  if (!TestingPortal::Get()->IsDebuggingMalloc()) {
    constexpr NumericProperty kHeapLimitMB{"tcmalloc.heap_limit_mb"};
    printf("Testing out of memory\n");
    tcmalloc::Cleanup cleanup_limit = kHeapLimitMB.Override(1<<10); // 1 gig. Note, this is in megs.
    // Don't exercise more than 1 gig, no need to.
    for (int s = 0; ; s += (10<<20)) {
      void* large_object = rnd.alloc(s);
      if (large_object == nullptr) {
        break;
      }
      free(large_object);
    }
  }
}

TEST(TCMallocTest, EmergencyMalloc) {
  auto portal = TestingPortal::Get();
  if (!portal->HasEmergencyMalloc()) {
    printf("EmergencyMalloc test skipped\n");
    return;
  }

  SetNewHook();
  SetDeleteHook();
  tcmalloc::Cleanup unhook([] () {
    ResetNewHook();
    ResetDeleteHook();
  });

  void* p1 = noopt(tc_malloc)(32);
  void* p2 = nullptr;

  VerifyNewHookWasCalled();

  portal->WithEmergencyMallocEnabled([&] () {
    p2 = noopt(malloc)(32);
  });

  ASSERT_NE(p2, nullptr);

  // Emergency malloc doesn't call hook
  ASSERT_EQ(g_NewHook_calls, 0);

  // Emergency malloc pointers are recognized by MallocExtension::GetOwnership
  ASSERT_EQ(MallocExtension::instance()->GetOwnership(p1), MallocExtension::kOwned);
  ASSERT_EQ(MallocExtension::instance()->GetOwnership(p2), MallocExtension::kOwned);

  EXPECT_FALSE(portal->IsEmergencyPtr(p1));
  EXPECT_TRUE(portal->IsEmergencyPtr(p2));

  // Emergency malloc automagically does the right thing for free()
  // calls and doesn't invoke hooks.
  free(p2);
  ASSERT_EQ(g_DeleteHook_calls, 0);

  free(p1);
  VerifyDeleteHookWasCalled();
}

TEST(TCMallocTest, EmergencyMallocNoHook) {
  auto portal = TestingPortal::Get();
  if (!portal->HasEmergencyMalloc()) {
    printf("EmergencyMallocNoHook test skipped\n");
    return;
  }

  void* p1 = noopt(tc_malloc)(32);
  void* p2 = nullptr;
  void* p3 = nullptr;
  void* p4 = nullptr;

  portal->WithEmergencyMallocEnabled([&] () {
    p2 = noopt(malloc)(32);
    for (int i = 11; i < 999; i++) {
      tc_free(p3);
      p3 = tc_calloc(1, i);
    }
    p4 = tc_calloc(4096, 1024);
  });

  ASSERT_NE(p2, nullptr);
  ASSERT_NE(p3, nullptr);
  ASSERT_NE(p4, nullptr);

  // Emergency malloc pointers are recognized by MallocExtension::GetOwnership
  ASSERT_EQ(MallocExtension::instance()->GetOwnership(p1), MallocExtension::kOwned);
  ASSERT_EQ(MallocExtension::instance()->GetOwnership(p2), MallocExtension::kOwned);
  ASSERT_EQ(MallocExtension::instance()->GetOwnership(p3), MallocExtension::kOwned);
  ASSERT_EQ(MallocExtension::instance()->GetOwnership(p4), MallocExtension::kOwned);

  EXPECT_FALSE(portal->IsEmergencyPtr(p1));
  EXPECT_TRUE(portal->IsEmergencyPtr(p2));
  EXPECT_TRUE(portal->IsEmergencyPtr(p3));
  EXPECT_TRUE(portal->IsEmergencyPtr(p4));

  SetNewHook();
  SetDeleteHook();
  tcmalloc::Cleanup unhook([] () {
    ResetNewHook();
    ResetDeleteHook();
  });

  // Emergency malloc automagically does the right thing for free()
  // calls and doesn't invoke hooks.
  free(p4);
  free(p3);
  free(p2);
  ASSERT_EQ(g_DeleteHook_calls, 0);

  free(p1);
  VerifyDeleteHookWasCalled();
}

TEST(TCMallocTest, Version) {
  // Test tc_version()
  int major;
  int minor;
  const char* patch;
  char mmp[64];
  const char* human_version = tc_version(&major, &minor, &patch);
  int used = snprintf(mmp, sizeof(mmp), "gperftools %d.%d%s", major, minor, patch);
  ASSERT_LT(used, sizeof(mmp));
  ASSERT_EQ(strcmp(TC_VERSION_STRING, human_version), 0);
}

struct EnvProperty {
  const char* const name;
  constexpr EnvProperty(const char* name) : name(name) {}

  std::string_view Get() const {
    const char* v = getenv(name);
    if (v == nullptr) {
      return {};
    }
    return {v};
  }

  using override_set = std::vector<std::pair<std::string, std::string>>;
  using env_override_fn = std::function<void(override_set*)>;

  static std::vector<const char*> DuplicateAndUpdateEnv(env_override_fn fn) {
    override_set overrides;
    fn(&overrides);
    return DoDuplicateAndUpdateEnv(std::move(overrides));
  }

  static std::vector<const char*> DoDuplicateAndUpdateEnv(override_set overrides) {
    std::vector<const char*> vec;

    for (const char* const *p = environ; *p; p++) {
      std::string_view k_and_v{*p};
      auto pos = k_and_v.find('=');
      CHECK(pos != std::string_view::npos);
      std::string_view k = k_and_v.substr(0, pos);
      int i = overrides.size() - 1;;
      for (; i >= 0; i--) {
        if (overrides[i].first == k) {
          break;
        }
      }
      if (i < 0) {
        vec.push_back(*p);
      }
    }

    for (const auto& [k, v] : overrides) {
      if (v.empty()) {
        continue;
      }

      size_t sz = k.size() + v.size() + 1 + 1;
      char* new_k_and_v = new char[sz];
      auto it = std::copy(k.begin(), k.end(), new_k_and_v);
      *it++ = '=';
      it = std::copy(v.begin(), v.end(), it);
      *it++ = '\0';
      CHECK_EQ(it, new_k_and_v + sz);

      vec.push_back(new_k_and_v);
    }

    vec.push_back(nullptr);

    return vec;
  }

  void Set(override_set* overrides, const char* new_value) const {
    overrides->emplace_back(std::string(name), std::string(new_value));
  }
  void SetAndPrint(override_set* overrides, const char* new_value) const {
    printf("Testing %s=%s\n", name, new_value);
    return Set(overrides, new_value);
  }
};

static const char* argv0; // set in HandleVariableRuns

#ifndef _WIN32
// Everything non-windows we assume sufficiently POSIX-ish
static void ReSpawnWithEnv(EnvProperty::env_override_fn env_override) {
  std::vector<const char*> env = EnvProperty::DuplicateAndUpdateEnv(env_override);
  char * const child_argv[] = {const_cast<char*>(argv0), nullptr};
  pid_t pid;
  int rv = posix_spawn(&pid, argv0, nullptr, nullptr, child_argv, const_cast<char**>(env.data()));
  if (rv != 0) {
    errno = rv;
    perror("posix_spawn");
    abort();
  }

  // parent
  int status = -1;
  pid_t wait_rv;
  do {
    wait_rv = waitpid(pid, &status, 0);
  } while (wait_rv < 0 && errno == EINTR);

  if (wait_rv < 0) {
    perror("waitpid");
    abort();
  }

  CHECK_EQ(wait_rv, pid);
  int exit_status = WEXITSTATUS(status);
  if (!WIFEXITED(status) || exit_status != 0) {
    printf("sub-process run failed with status = %d.\n", status);
    if (WIFEXITED(status)) {
      exit(exit_status);
    }
    exit(1);
  }
}
#else
// Windows spawning codes
static void ReSpawnWithEnv(EnvProperty::env_override_fn env_override) {
  std::vector<const char*> env = EnvProperty::DuplicateAndUpdateEnv(env_override);

  // For windows CreateProcessA environment needs to be converted to
  // environment block. Which is just a successive ASCIIZ strings
  // terminated by \0 (blank string). So we convert our vector
  // environment entries to this format.
  env.pop_back(); // last element is nullptr

  std::vector<std::string_view> env_views;
  env_views.reserve(env.size());
  size_t total_size = 0;
  for (const char* s : env) {
    env_views.push_back(s);
    total_size += env_views.rbegin()->size() + 1;
  }
  total_size++; // account for final empty string

  std::unique_ptr<char[]> env_block = std::make_unique<char[]>(total_size);
  char* env_block_p = env_block.get();
  for (std::string_view s : env_views) {
    env_block_p = std::copy(s.begin(), s.end(), env_block_p);
    *env_block_p++ = '\0';
  }
  *env_block_p++ = '\0';
  CHECK_EQ(env_block_p, &(env_block[total_size]));

  fflush(stdout);
  fflush(stderr);

  STARTUPINFOA si;
  PROCESS_INFORMATION pi;

  memset(&si, 0, sizeof(si));
  si.cb = sizeof(si);
  memset(&pi, 0, sizeof(pi));

  if (!CreateProcessA(argv0,
                      nullptr, // command line. nullptr implies just argv0
                      nullptr, // process attributes
                      nullptr, // thread attributes
                      TRUE,    // InheritHandles
                      0,       // creation flags
                      env_block.get(),
                      nullptr, // current directory
                      &si,
                      &pi)) {
    printf("CreateProcessA failed with error code: %x\n", (unsigned)GetLastError());
    abort();
  }

  WaitForSingleObject(pi.hProcess, INFINITE);

  DWORD exit_code;
  GetExitCodeProcess(pi.hProcess, &exit_code);

  CloseHandle(pi.hProcess);
  CloseHandle(pi.hThread);

  if (exit_code != 0) {
    printf("sub-process run failed with status = %d\n", (int)exit_code);
    exit((int)exit_code);
  }
}
#endif // _WIN32

// We want to run tests with several runtime configuration tweaks. For
// improved test coverage. Previously we had shell script driving
// this, now we handle this by exec-ing just at the end of all tests.
//
// Do note, though, that this logic is only activated if test program
// is run with no args. I.e. if you're debugging specific unit-test(s)
// by passing --gtest_filter or other flags, you'll need to set up
// environment variables yourself. See SetupExec below.
//
// We test 4 extra settings:
//
// * TCMALLOC_TRANSFER_NUM_OBJ = 40
//
// * TCMALLOC_TRANSFER_NUM_OBJ = 4096
//
// * TCMALLOC_AGGRESSIVE_DECOMMIT = t
//
// * TCMALLOC_HEAP_LIMIT_MB = 512
//
// * TCMALLOC_ENABLE_SIZED_DELETE = t (note, this one is no-op in most
//     common builds)
void HandleVariableRuns(int argc, char** argv) {
  if (argc != 1) {
    return;
  }

  argv0 = argv[0];

  static constexpr EnvProperty kMarker{"TCMALLOC_UNITTEST_MARKER"};
  static constexpr EnvProperty kTransferNumObjEnv{"TCMALLOC_TRANSFER_NUM_OBJ"};
  static constexpr EnvProperty kAggressiveDecommitEnv{"TCMALLOC_AGGRESSIVE_DECOMMIT"};
  static constexpr EnvProperty kHeapLimitEnv{"TCMALLOC_HEAP_LIMIT_MB"};
  static constexpr EnvProperty kEnableSizedDeleteEnv{"TCMALLOC_ENABLE_SIZED_DELETE"};

  if (!kMarker.Get().empty()) {
    return; // We're unitttest child
  }

  using override_set = EnvProperty::override_set;

  ReSpawnWithEnv([] (override_set* overrides) {
    kMarker.Set(overrides, "_");
  });

  ReSpawnWithEnv([] (override_set* overrides) {
    kTransferNumObjEnv.SetAndPrint(overrides, "40");
    kMarker.Set(overrides, "_");
  });

  ReSpawnWithEnv([] (override_set* overrides) {
    kTransferNumObjEnv.SetAndPrint(overrides, "4096");
    kMarker.Set(overrides, "_");
  });

  ReSpawnWithEnv([] (override_set* overrides) {
    kTransferNumObjEnv.Set(overrides, "");
    kAggressiveDecommitEnv.SetAndPrint(overrides, "t");
    kMarker.Set(overrides, "_");
  });

  ReSpawnWithEnv([] (override_set* overrides) {
    kAggressiveDecommitEnv.Set(overrides, "");
    kHeapLimitEnv.SetAndPrint(overrides, "512");
    kMarker.Set(overrides, "_");
  });

  ReSpawnWithEnv([] (override_set* overrides) {
    kHeapLimitEnv.Set(overrides, "");
    kEnableSizedDeleteEnv.SetAndPrint(overrides, "t");
    kMarker.Set(overrides, "_");
  });

  exit(0);
}

#ifdef HAVE_FORK_TESTING_SUPPORT
namespace fork_torture {

// Fork torture testing.
//
// Basic idea is to enable x86 single-stepping mode. And have signal
// handler for SIGTRAP wake up a helper thread. That helper thread
// forks and runs some malloc activities in the child.
//
// We also setup cpu mask with exactly one cpu and have helper thread
// on real-time scheduling policy. This ensures that whenever helper
// thread runs forking, we can unblock main thread, but main thread
// will only run when helper thread is blocked on some lock.
//
// Intended outcome is to exercise fork in multithreaded programs on
// roughly every possible opportunity.
//
// We also add a small optimization of only really stopping on
// instructions immediately after instruction with LOCK
// prefix. I.e. after some locking operation is complete.
//
// This is Linux- and x86-64-specific for simplicity.

// single_step_req is waited by the helper thread and posted by main
// thread from single-step signal handler.
sem_t single_step_req;
// single_step_ack is waited by the main thread and posted by the
// helper thread.
sem_t single_step_ack;

// in_fork is a flag set iff helper thread is running the forking activity.
bool in_fork;

// These 2 flags are helping us make sure we're actually done forking
// at the end of test runner.
bool stepping_stop_requested;
bool stepping_stop_acked;

uint64_t num_forks;

void xsem_wait(sem_t* sem) {
  while (sem_wait(sem) < 0) {
    CHECK(errno == EINTR);
  }
}

constexpr uintptr_t kTF = 0x100; // Trace flag in x86 FLAGS register.

bool try_handle_sigtrap_blocking(uint8_t* at_rip, ucontext_t* uc);

void step_handler(int signo, siginfo_t* si, void* _uc) {
  ucontext_t* uc = static_cast<ucontext_t*>(_uc);
  auto at_rip = reinterpret_cast<uint8_t*>(uc->uc_mcontext.gregs[REG_RIP]);

  if (stepping_stop_requested) {
    uc->uc_mcontext.gregs[REG_EFL] &= ~kTF;
    while (in_fork) {
      (void)*const_cast<volatile bool*>(&in_fork);
    }
    stepping_stop_acked = true;
    return;
  }

  if (try_handle_sigtrap_blocking(at_rip, uc)) {
    return;
  }

  if (in_fork) {
    return;
  }

  // Add TF to flags and request SIGTRAP on every instruction in this
  // thread. We could do it only once, but it is harmless to do it
  // always.
  uc->uc_mcontext.gregs[REG_EFL] |= kTF;

  static bool last_was_lock;

  if (!last_was_lock) {
    if (*at_rip == 0xf0) { // lock prefix.
      last_was_lock = true;
    }
    return;
  }

  last_was_lock = false;

  int errno_save = errno;

  (void)sem_post(&single_step_req);
  xsem_wait(&single_step_ack);

  errno = errno_save;
}

bool try_handle_sigtrap_blocking(uint8_t* at_rip, ucontext_t* uc) {
  if (at_rip[0] != 0x0f || at_rip[1] != 0x05) {
    return false;
  }

  // syscall instruction. Lets check if someone is about to block
  // SIGTRAP. If so we must turn off single-stepping, because
  // otherwise blocked SIGTRAP and pending single-stepping will kill
  // the process.

  auto& regs = uc->uc_mcontext.gregs;
  if (regs[REG_RAX] != SYS_rt_sigprocmask) {
    return false;
  }
  if (regs[REG_RDI] != SIG_SETMASK && regs[REG_RDI] != SIG_BLOCK) {
    return false;
  }
  sigset_t* newmask = reinterpret_cast<sigset_t*>(regs[REG_RSI]);
  if (!newmask || !sigismember(newmask, SIGTRAP)) {
    return false;
  }

  // okay, once we detected this case, we drop single-stepping
  // flag, block SIGTRAP and raise it. So that when SIGTRAP is
  // eventually unblocked, we'll get back to signal hander and
  // re-set single-stepping back.
  regs[REG_EFL] &= ~kTF;
  raise(SIGTRAP);
  sigset_t* oldmask = reinterpret_cast<sigset_t*>(regs[REG_RDX]);
  if (oldmask) {
    *oldmask = uc->uc_sigmask;
    regs[REG_RDX] = 0; // handle "get old mask" part, so we can block
                       // our signal
  }
  sigaddset(&uc->uc_sigmask, SIGTRAP);

  return true;
}

tcmalloc::Cleanup<std::function<void()>> setup_fork_testing(int* argc, char *** argv) {
  if (*argc < 2 || (*argv)[1] != std::string("--with-fork-torture")) {
    printf("Not enabling fork torture\n");
    return tcmalloc::Cleanup(std::function<void()>([] () {}));
  }
  printf("Enabling fork torturing!!!!\n");

  CHECK(sem_init(&single_step_req, 0, 0) == 0);
  CHECK(sem_init(&single_step_ack, 0, 0) == 0);

  // First, we set cpu affinity mask to only core 0. It helps
  // performance, but mostly it is required so that main thread never
  // runs when real-time helper thread is runnable.
  {
    cpu_set_t mask;
    memset(&mask, 0, sizeof(mask));
    CPU_SET(0, &mask);
    CHECK(sched_setaffinity(0, sizeof(mask), &mask) == 0);
  }

  // Then we prepare SIGTRAP signal handler.
  {
    struct sigaction sa;
    memset(&sa, 0, sizeof(sa));
    sa.sa_sigaction = step_handler;
    sa.sa_flags = SA_RESTART | SA_SIGINFO;
    CHECK(sigaction(SIGTRAP, &sa, nullptr) == 0);
  }

  std::thread* t = new std::thread([] () {
    // Helper thread first makes itself real-time.
    struct sched_param p;
    memset(&p, 0, sizeof(p));
    p.sched_priority = 1;
    CHECK(sched_setscheduler(0, SCHED_FIFO, &p) == 0);

    // And then signals its readiness.
    sem_post(&single_step_ack);

    MallocExtension::instance()->MarkThreadIdle();

    constexpr int kPeriod = 1 << 10;
    int cnt = kPeriod;

    while (true) {
      xsem_wait(&single_step_req);
      // Lets print something every few iterations to help us see if
      // progress is being made.
      if (--cnt <= 0) {
        write(2, "$", 1);
        cnt = kPeriod;
      }

      // Once we're about to fork, we need to flag "in_fork" mode and
      // unblock main thread.
      in_fork = true;
      sem_post(&single_step_ack);

      int child = fork();
      CHECK(child >= 0);
      if (child == 0) {
        // Child runs some mallocs and exits.
        (::operator delete)((::operator new)(32));
        (::operator delete)((::operator new)(1024));
        (::operator delete)((::operator new)(2 << 20));
        _exit(0);
      }

      // Parent asserts that child exited cleanly.
      int status = 0;
      int ret = waitpid(child, &status, 0);
      CHECK(ret == child);
      CHECK(status == 0);

      // And we un-mark in_fork mode, so that main thread continues to
      // cooperation via sem_{post/wait} on single_step_{req,ack}
      // semaphores.
      num_forks++;
      in_fork = false;
    }
  });
  (void)t; // leak
  xsem_wait(&single_step_ack);

  MallocExtension::instance()->MarkThreadIdle();

  // First SIGTRAP runs the signal handler and signal handler sets up
  // EFLAGS to single-step.
  raise(SIGTRAP);

  // This is a flag for a test that is not compatible with
  // single-stepping. NewHandler test doesn't work because it enables
  // oom simulation at some point which, naturally, crashes the forked
  // child.
  running_fork_testing = true;

  return tcmalloc::Cleanup(std::function<void()>([] () {
    stepping_stop_requested = true;
    while (!*const_cast<volatile bool*>(&stepping_stop_acked)) {
      // no-op
    }
    // In the clean up, we're ensuring that in_fork turns to false, so
    // that fork/waitpid isn't stuck.
    printf("Done with fork torturing! Number of forks performed: %lld\n", (long long)num_forks);
  }));
}
}  // namespace fork_torture

using fork_torture::setup_fork_testing;

#else  // HAVE_FORK_TESTING_SUPPORT

int setup_fork_testing(int* argc, char *** argv) {return 0;}

#endif  // !HAVE_FORK_TESTING_SUPPORT

int main(int argc, char** argv) {
  HandleVariableRuns(argc, argv);

  if (TestingPortal::Get()->IsDebuggingMalloc()) {
    // return freed blocks to tcmalloc immediately
    TestingPortal::Get()->GetMaxFreeQueueSize() = 0;
  }

#if defined(__linux) || defined(_WIN32)
  // We know that Linux and Windows have functional memory releasing
  // support. So don't let us degrade on that.
  if (!getenv("DONT_TEST_SYSTEM_RELEASE")) {
    CHECK(TestingPortal::Get()->HaveSystemRelease());
  }
#endif

  testing::InitGoogleTest(&argc, argv);

  auto fork_cleanup = setup_fork_testing(&argc, &argv);
  (void)fork_cleanup;

  int err_code = RUN_ALL_TESTS();
  if (err_code) {
    return err_code;
  }
}
